import linecache
import logging

import gensim

# Configure the root logger: timestamped records at INFO level and above.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)


def load_text(url):
    """Read a whitespace-tokenised corpus, one document per line.

    Args:
        url: Path of the text file to read. Despite the name, it is opened
            as a local file; nothing is fetched over the network.

    Returns:
        A list with one entry per line of the file. Each entry is the list
        of whitespace-separated tokens on that line (an empty list for a
        blank line).

    Raises:
        OSError: If the file cannot be opened (e.g. it does not exist).
        UnicodeDecodeError: If the file content is not valid UTF-8.
    """
    # Read the file directly instead of going through ``linecache``: that
    # module exists for traceback source lookup, reports a missing or
    # undecodable file as an empty list rather than an error, and keeps the
    # lines in a process-wide cache.
    # ``utf-8-sig`` decodes plain UTF-8 and also drops a leading BOM, so the
    # first token of the first document is never polluted by it.
    with open(url, encoding='utf-8-sig') as corpus_file:
        return [line.split() for line in corpus_file]


def train(corpus_path='out/corpus_no_stopwords.txt', model_path='hdp_model'):
    """Fit a gensim HDP model on a tokenised corpus and save it to disk.

    Args:
        corpus_path: Text file with one whitespace-tokenised document per
            line. Defaults to the path this script has always used.
        model_path: Destination handed to ``HdpModel.save``. Defaults to
            the file name this script has always used.

    Raises:
        ValueError: If ``corpus_path`` yields no documents at all.
    """
    documents = load_text(corpus_path)
    if not documents:
        # Fail loudly rather than fitting and saving a model built from
        # nothing (typically a wrong or unreadable corpus path).
        raise ValueError(
            'no documents loaded from {!r}'.format(corpus_path)
        )

    # Map each distinct token to an integer id, then turn every document
    # into its bag-of-words representation over that vocabulary.
    dictionary = gensim.corpora.Dictionary(documents)
    bow_corpus = [dictionary.doc2bow(tokens) for tokens in documents]

    hdp = gensim.models.HdpModel(corpus=bow_corpus, id2word=dictionary)
    hdp.save(model_path)


# Run the training pipeline only when executed as a script, not on import.
if __name__ == "__main__":
    train()
